Spatial Analysis of 15K Observation Wells in Germany
Maximilian Noelscher
29 August 2019
The various methods and algorithms of machine learning are explored and tested with the purpose to regionalize specific capacity for the study area of the Lake Chad Basin. Specific capacity was obtained from puming tests at 1387 locations.Various geoscientific parameters are used and tested for training and building the models, such as elevation, river density, lithology, etc. But before getting started, some presettings are necessary to simplify coding and reproducability.
Set the working directory relative to the folder containing the source script.
setwd(substr(
rstudioapi::getActiveDocumentContext()$path,
1,
rev(gregexpr(
"/", rstudioapi::getActiveDocumentContext()$path
)[[1]])[2]
))Often needed parts of the scipt were outsourced to external functions. This simplifies and shortens the code for a more comprehensible reading. The following lines source these functions from the folder called functions with a relative path to the active documents path.
purrr::map(
list.files(
path = './scripts/functions',
pattern = "\\.R$",
full.names = TRUE,
recursive = TRUE
),
source
)Install and load all packages within the brackets of the pacman::p_load() function. Note: Load the tidyverse package last to prevent functions to be masked by other packages.
if (!require("pacman"))
install.packages("pacman")
pacman::p_load(
readxl,
janitor,
ggplot2,
kable,
kableExtra,
png,
FSA,
sp,
strict,
glue,
leaflet,
leaflet.extras,
htmlwidgets,
htmltools,
plotly,
RANN,
plyr,
dplyr,
spdplyr,
rgeos,
rgdal,
raster,
tidyverse
)stammdaten <- readxl::read_excel('./raw_data/Datensatz/Stammdaten_D.xlsx') %>%
as_tibble() %>%
clean_names()
names(stammdaten) <- c('obswell_id', 'name', 'lon', 'lat')Load hydrogeological unit data
# hydr_unit <- readOGR('./raw_data/hyraum_v32/hyraum_gr__v32_poly.shp', encoding = "UTF-8/LATIN-1/...") %>%
# clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
#
# hydr_unit <- hydr_unit %>%
# dplyr::mutate(gr_name = str_replace_all(gr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz"))) %>%
# dplyr::mutate(gr_nr_name = str_replace_all(gr_nr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz")))
#
# hydr_unit %>% writeOGR('./processed_data/hydr_unit_umlaute_rm.shp', layer = 'hydr_unit_umlaute_rm', driver="ESRI Shapefile")Load hydrogeological unit data
hydr_unit <- readOGR('./processed_data/hydr_unit_umlaute_rm.shp') %>%
clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))## OGR data source with driver: ESRI Shapefile
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\processed_data\hydr_unit_umlaute_rm.shp", layer: "hydr_unit_umlaute_rm"
## with 11 features
## It has 5 fields
Data on groundwater pumping stations was obtained from the HAD Thema 7.2 provided by Bundesanstalt für Gewässerkunde (BfG) Source: „Hydrologischer Atlas von Deutschland/BfG, 2000“
Import
public_water_prod_sp <- readOGR('./raw_data/had/off_wassergewinnung.shp')## OGR data source with driver: ESRI Shapefile
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\off_wassergewinnung.shp", layer: "off_wassergewinnung"
## with 1230 features
## It has 33 fields
## Integer64 fields read as strings: TK X_LAMBERT Y_LAMBERT
mining_water_prod_sp <- readOGR('./raw_data/had/bergbau_wassergewinnung.shp')## OGR data source with driver: ESRI Shapefile
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\bergbau_wassergewinnung.shp", layer: "bergbau_wassergewinnung"
## with 379 features
## It has 30 fields
## Integer64 fields read as strings: TK X_LAMBERT Y_LAMBERT
powerplant_water_prod_sp <- readOGR('./raw_data/had/kraftwerke.shp')## OGR data source with driver: ESRI Shapefile
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\kraftwerke.shp", layer: "kraftwerke"
## with 62 features
## It has 31 fields
## Integer64 fields read as strings: TK X_LAMBERT Y_LAMBERT
reservoir_sp <- readOGR('./raw_data/had/talsperre.shp')## OGR data source with driver: ESRI Shapefile
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\talsperre.shp", layer: "talsperre"
## with 64 features
## It has 47 fields
## Integer64 fields read as strings: LFD_NR R_WERT H_WERT XLAMBERT YLAMBERT NR_HAD
First we list all files in the directory
# files_list <- list.files(path = "./raw_data/Datensatz/Messstellen preprocessed/", pattern = "*.txt")Then we apply a custom function to read in all files, bind them and extract the lines with first observation grouped by the wells This implementation to find the start date is very slow, next time it would be worth trying another method.
# start_date <- files_list %>% extract_start_date()Now we ungroup the dataframe
# start_date <- start_date %>% ungroup()
# names(start_date) <- c(names(stammdaten)[1], 'name', 'start_date', 'value')As the previous steps took a few ours, we store the result as .csv
# start_date %>% write_csv2(path = './processed_data/start_date.csv')Read in start_date.csv
start_date <- read_csv2('./processed_data/start_date.csv')Join the 2 dataframes by their common obswell_id
stammdaten_join_startdate <- stammdaten %>%
dplyr::select(-name) %>%
left_join(start_date, by = 'obswell_id')Show example entries
stammdaten_join_startdate %>% headtail()## obswell_id lon lat name start_date
## 1 BB_25470023 808527 5928687 Ottenhagen OP 2000-11-20
## 2 BB_25470024 808530 5928686 Ottenhagen UP 2000-12-18
## 3 BB_25480025 820407 5933409 Neumannshof 2000-11-20
## 13490 TH_5633900114 660999 5581364 Hy Heinersdorf 1_2004 2007-01-08
## 13491 TH_5729240555 616545 5566272 Hy Schweickershausen 1_2002 2007-01-08
## 13492 TH_5730000077 625563 5568125 Br.Lindenau (0006) 2007-01-08
## value
## 1 78.96
## 2 78.78
## 3 34.93
## 13490 429.972
## 13491 348.9
## 13492 281.26
Now we separate the coordinates from the dataframe
coords <- stammdaten_join_startdate %>%
dplyr::select(lon, lat)Show example entries
coords %>% headtail()## lon lat
## 1 808527 5928687
## 2 808530 5928686
## 3 820407 5933409
## 13490 660999 5581364
## 13491 616545 5566272
## 13492 625563 5568125
Delete coordinate columns from dataframe
stammdaten_join_startdate <- stammdaten_join_startdate %>%
dplyr::select(-one_of(colnames(coords)))Show example entries
stammdaten_join_startdate %>% headtail()## obswell_id name start_date value
## 1 BB_25470023 Ottenhagen OP 2000-11-20 78.96
## 2 BB_25470024 Ottenhagen UP 2000-12-18 78.78
## 3 BB_25480025 Neumannshof 2000-11-20 34.93
## 13490 TH_5633900114 Hy Heinersdorf 1_2004 2007-01-08 429.972
## 13491 TH_5729240555 Hy Schweickershausen 1_2002 2007-01-08 348.9
## 13492 TH_5730000077 Br.Lindenau (0006) 2007-01-08 281.26
Now we create a SpatialPointsdataframe from our stammdaten dataframe
obswell_sp <- SpatialPointsDataFrame(coords = coords,
data = stammdaten_join_startdate,
proj4string = CRS('+proj=utm +zone=32 +ellps=GRS80 +units=m +no_defs'))The coodinate reference system needs to be transformed to the one leaflet is expecting
obswell_sp <- obswell_sp %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))We create a color palette from the start dates
pal <- colorNumeric(
palette = "Spectral",
domain = as.numeric(obswell_sp$start_date))We create a leaflet html GIS
map1 <- obswell_sp %>% create_custom_html_leafletmap1()Plot the leaflet GIS
map1Save the leaflet widget
saveWidget(map1, file='map1.html')For the classification of the time series as class anthropogenic or class natural training data is needed The selection of the training data for these two classes is described in the following chapter
Water sources per federal state in Germany
CRS Transformation
public_water_prod_sp <- public_water_prod_sp %>%
spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
mining_water_prod_sp <- mining_water_prod_sp %>%
spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
powerplant_water_prod_sp <- powerplant_water_prod_sp %>%
spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
reservoir_sp <- reservoir_sp %>%
spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))Adding Source
public_water_prod_sp@data <- public_water_prod_sp@data %>%
as_tibble() %>%
dplyr::mutate(source = 'public water production')
mining_water_prod_sp@data <- mining_water_prod_sp@data %>%
as_tibble() %>%
dplyr::mutate(source = 'mining water production')
powerplant_water_prod_sp@data <- powerplant_water_prod_sp@data %>%
as_tibble() %>%
dplyr::mutate(source = 'powerplant water production')
reservoir_sp@data <- reservoir_sp@data %>%
as_tibble() %>%
dplyr::mutate(source = 'reservoir')Specific Manipulation Filter out only types of water production that affect groundwater directly (Grundwasser, Mehrfachentnahme)
mining_water_prod_sp <- mining_water_prod_sp %>%
dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')
public_water_prod_sp <- public_water_prod_sp %>%
dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')
powerplant_water_prod_sp <- powerplant_water_prod_sp %>%
dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')Join Points
water_production_join_sp <- raster::union(mining_water_prod_sp, public_water_prod_sp)
water_production_join_sp <- raster::union(water_production_join_sp, powerplant_water_prod_sp)Add unique IDs
water_production_join_sp <- water_production_join_sp %>%
dplyr::mutate(id_unique = paste0('id_', seq(1,length.out = base::nrow(water_production_join_sp))))Plot in Map
pal2 <- colorFactor(palette = 'Set1', domain = base::as.factor(water_production_join_sp$source) %>% base::unique())
labels <- c('powerplant water production', 'mining water production', 'public water production')Create a leaflet GIS
map2 <- map1 %>% create_custom_html_leafletmap2()Plot the leaflet GIS
map2Save the leaflet widget
saveWidget(map2, file='map2.html')Find nearest neighbors
obswell_sp_coords <- obswell_sp %>%
spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')) %>%
coordinates() %>%
as_tibble()
water_production_join_sp_coords <- water_production_join_sp %>%
spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')) %>%
coordinates() %>%
as_tibble()
temp_result <- water_production_join_sp_coords %>%
nn2(obswell_sp_coords, k = 1)
nearest_neighbors <- tibble(index = temp_result$nn.idx[,1],
distance = temp_result$nn.dists[,1])
obswell_sp <- obswell_sp %>%
dplyr::mutate(nn_index = nearest_neighbors$index,
nn_distance = nearest_neighbors$distance)Plot the distribution of the distance
nearest_neighbors %>% plot_histogram_custom3()For the selection of observation wells that are antropogenically influenced, we choose wells from cities that use groundwater for drinking purposes or have shallow groundwater table. A shallow groundwater table causes a groundwater abstraction for constructions leading to an anthropogenically influenced groundwater table.
anthropogenic_obswell_m2 <- obswell_sp %>%
dplyr::filter(nn_distance < 1000)
anthropogenic_obswell_m2 %>%
as_tibble() %>%
write_csv2('./processed_data/training_data_anthropogenic_obswell_m2.csv')Plot distribution of the distance of antropogenic training data
anthropogenic_obswell_m2@data %>% ggplot() +
geom_histogram(
aes(nn_distance),
binwidth = 100,
col = 'black',
fill = 'red',
alpha = .5
)natural_obswell_m2 <- obswell_sp %>%
dplyr::arrange(nn_distance) %>%
dplyr::slice(utils::tail(row_number(),
base::nrow(anthropogenic_obswell_m2)))
natural_obswell_m2 %>%
as_tibble() %>%
write_csv2('./processed_data/training_data_natural_obswell_m2.csv')Plot distribution of the distance of natural training data
natural_obswell_m2@data %>% ggplot() +
geom_histogram(
aes(nn_distance),
binwidth = 1000,
col = 'black',
fill = 'red',
alpha = .5
)